Install/Import Required Packages

suppressMessages({
  # Ensure a CRAN package is installed, then attach it.
  # (The original used `if (!require(pkg)) install.packages(pkg)` repeated per
  # package, which duplicated every package name and never install-checked MASS.)
  ensure_package <- function(pkg) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }

  cran_packages <- c(
    "knitr", "ggplot2", "plotly", "MASS", "rpart", "rpart.plot",
    "randomForest", "gbm", "caret", "factoextra", "FactoMineR",
    "pROC", "ggdendro", "mlbench", "pheatmap", "BiocManager"
  )
  invisible(lapply(cran_packages, ensure_package))

  # GEOquery is distributed through Bioconductor, not CRAN.
  if (!requireNamespace("GEOquery", quietly = TRUE)) {
    BiocManager::install("GEOquery")
  }
  library(GEOquery)
})
## Warning: package 'BiocManager' was built under R version 4.4.3

Supervised Learning

Types of Supervised Learning

Regression

Classification

Supervised Learning Methods

We will go through three supervised learning methods in this session:

Linear Models

Linear Models - Regression: Linear Regression

Data Set: mtcars

**As a simple data set, we will use the “mtcars” data set in R. The following code can be used to display the data set.**

library(knitr)
data("mtcars")
# Display the first ten cars as a formatted table.
first_ten_cars <- head(mtcars, n = 10)
kable(first_ten_cars, caption = "Motor Trend Car Road Tests")
Motor Trend Car Road Tests
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4

**Let’s Display the data set for Miles per Gallon (response) and Weight (predictor).**

# Scatter plot of fuel efficiency against car weight.
mtcars_plot <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  xlab('Weight (1000 lbs)') +
  ylab('Miles Per Gallon')

ggplotly(mtcars_plot)

Simple Linear Regression Model

The R function lm is used to fit a linear regression model in R, with Miles per Gallon as the response and Weight as the predictor.

# Fit a simple linear regression: mpg (response) on weight (predictor).
simple_linear_regression <- lm(mpg ~ wt, data = mtcars)
summary(simple_linear_regression)
## 
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10
library(ggplot2)
library(plotly)

# Reuse the model fitted above instead of refitting lm() inside the plot
# specification (the original called lm() again inside geom_segment's aes()).
fitted_vals <- fitted(simple_linear_regression)

linear_reg_gg <- ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  # Dashed segments visualize each car's residual (observed minus fitted mpg).
  geom_segment(aes(xend = wt, yend = fitted_vals), linetype = "dashed") +
  xlim(0, 5.5) +
  xlab('Weight (1000 lbs)') + ylab('Miles Per Gallon')

ggplotly(linear_reg_gg)
## `geom_smooth()` using formula = 'y ~ x'
Linear Regression Assumptions

The following plots are used to check whether the assumptions of the linear regression model are satisfied.

  • Linearity of Data (Residuals vs Fitted Plot)
  • Normality of Residuals (Normal Q-Q Plot)
  • Constant Variance of Residuals (Scale-Location Plot)
  • No Outliers (Cook’s Distance Plot)
# Arrange the four diagnostic plots in a 2x2 grid.
# (Use TRUE, not the reassignable shorthand T.)
layout(matrix(1:4, byrow = TRUE, ncol = 2))
# which = 1:4 gives: Residuals vs Fitted, Normal Q-Q, Scale-Location,
# and Cook's distance.
plot(simple_linear_regression, which = 1:4)

Multiple Linear Regression Model

Let’s add more input variables (or predictor variables) to the linear regression model.

\[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~ Cylinders + \beta_3 ~Rear\_axle\_Ratio + Error \]

# Multiple regression: weight, cylinder count, and rear-axle ratio as predictors.
multiple_linear_regression <- lm(mpg ~ wt + cyl + drat, data = mtcars)
summary(multiple_linear_regression)
## 
## Call:
## lm(formula = mpg ~ wt + cyl + drat, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2944 -1.5576 -0.4667  1.5678  6.1014 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.7677     6.8729   5.786 3.26e-06 ***
## wt           -3.1947     0.8293  -3.852 0.000624 ***
## cyl          -1.5096     0.4464  -3.382 0.002142 ** 
## drat         -0.0162     1.3231  -0.012 0.990317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.613 on 28 degrees of freedom
## Multiple R-squared:  0.8302, Adjusted R-squared:  0.812 
## F-statistic: 45.64 on 3 and 28 DF,  p-value: 6.569e-11

Since the Rear axle ratio is not significant (p-value > 0.05), we can remove that variable and run the model again. \[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~ Cylinders + Error \]

# Refit after dropping the non-significant rear-axle ratio term.
multiple_linear_regression <- lm(mpg ~ wt + cyl, data = mtcars)
summary(multiple_linear_regression)
## 
## Call:
## lm(formula = mpg ~ wt + cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2893 -1.5512 -0.4684  1.5743  6.1004 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.6863     1.7150  23.141  < 2e-16 ***
## wt           -3.1910     0.7569  -4.216 0.000222 ***
## cyl          -1.5078     0.4147  -3.636 0.001064 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.568 on 29 degrees of freedom
## Multiple R-squared:  0.8302, Adjusted R-squared:  0.8185 
## F-statistic: 70.91 on 2 and 29 DF,  p-value: 6.809e-12
Model Selection with Multiple Linear Regression Model

Full Model

# Regress mpg on every other column of mtcars (the "full" model).
full_model_linear_regression <- lm(mpg ~ ., data = mtcars)
summary(full_model_linear_regression)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

Backward Elimination

This method starts with the full model and eliminates unimportant predictor variables from the model.

library(MASS)
# Backward elimination: start from the full model and drop predictors by AIC.
backward_model <- stepAIC(
  full_model_linear_regression,
  direction = "backward",
  trace = FALSE
)
summary(backward_model)
## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

Stepwise Method

This method adds and removes predictor variables systematically, re-evaluating the model at each step (here the search is initialized from the full model).

library(MASS)
# Stepwise selection: add and remove predictors by AIC until no move improves.
stepwise_model <- stepAIC(
  full_model_linear_regression,
  direction = "both",
  trace = FALSE
)
summary(stepwise_model)
## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

Linear Models - Classification: Logistic Regression

Data Set: Heart Disease Data Set

In this section, heart disease (heart_df) data set is used to apply classification models.

# NOTE(review): the original script first built a 14-row heart_df here and then
# immediately overwrote it with the 50-row data frame below; that dead code and
# the unused `n_subjects` constant have been removed.
set.seed(123)

# Simulated cohort of 50 subjects: age in years, serum cholesterol, and a
# binary heart-disease indicator (1 = disease present).
heart_df = data.frame(
  age = c(
    56,63,59,48,62,54,71,67,44,53,
    58,65,49,72,61,47,55,69,52,64,
    45,73,60,51,68,57,46,70,66,43,
    74,50,62,58,47,69,54,63,55,71,
    49,67,60,52,42,56,64,45,70,53
  ),
  cholesterol = c(
    253,184,245,205,218,171,240,202,168,188,
    205,230,176,245,215,190,196,238,185,222,
    165,250,208,182,236,200,172,242,228,160,
    255,178,217,204,171,210,250,223,198,244,
    174,231,209,184,247,199,221,166,201,187
  ),
  heart_disease = c(
    1,1,1,0,1,0,1,1,0,0,
    1,1,0,1,1,0,1,1,0,1,
    0,1,1,0,1,0,0,1,1,0,
    1,0,1,1,0,1,0,1,0,1,
    0,1,1,0,1,1,1,0,0,0
  ))

heart_df
##    age cholesterol heart_disease
## 1   56         253             1
## 2   63         184             1
## 3   59         245             1
## 4   48         205             0
## 5   62         218             1
## 6   54         171             0
## 7   71         240             1
## 8   67         202             1
## 9   44         168             0
## 10  53         188             0
## 11  58         205             1
## 12  65         230             1
## 13  49         176             0
## 14  72         245             1
## 15  61         215             1
## 16  47         190             0
## 17  55         196             1
## 18  69         238             1
## 19  52         185             0
## 20  64         222             1
## 21  45         165             0
## 22  73         250             1
## 23  60         208             1
## 24  51         182             0
## 25  68         236             1
## 26  57         200             0
## 27  46         172             0
## 28  70         242             1
## 29  66         228             1
## 30  43         160             0
## 31  74         255             1
## 32  50         178             0
## 33  62         217             1
## 34  58         204             1
## 35  47         171             0
## 36  69         210             1
## 37  54         250             0
## 38  63         223             1
## 39  55         198             0
## 40  71         244             1
## 41  49         174             0
## 42  67         231             1
## 43  60         209             1
## 44  52         184             0
## 45  42         247             1
## 46  56         199             1
## 47  64         221             1
## 48  45         166             0
## 49  70         201             0
## 50  53         187             0
# Age vs cholesterol, colored by disease status.
heart_plot <- ggplot(
  data = heart_df,
  mapping = aes(x = cholesterol, y = age, col = as.factor(heart_disease))
) +
  geom_point() +
  guides(color = guide_legend(title = "Heart Disease")) +
  # NOTE(review): "Cholestoral" is misspelled in the original axis label;
  # preserved here to keep the rendered output identical.
  xlab('Cholestoral') + ylab('Age in Years')
ggplotly(heart_plot)

library(ggplot2)

# Plot the fitted logistic curve of disease probability against cholesterol.
heart_plot <- ggplot(heart_df, aes(x = cholesterol, y = heart_disease)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  ) +
  xlab('Cholesterol') + ylab('Heart Disease')

ggplotly(heart_plot)
## `geom_smooth()` using formula = 'y ~ x'

\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol)}{1+exp(\beta_0+\beta_1~Cholesterol)}\]

# Logistic regression of disease status on cholesterol alone.
logistic_regression <- glm(heart_disease ~ cholesterol, data = heart_df, family = 'binomial')
summary(logistic_regression)
## 
## Call:
## glm(formula = heart_disease ~ cholesterol, family = "binomial", 
##     data = heart_df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -19.82515    5.50299  -3.603 0.000315 ***
## cholesterol   0.09940    0.02745   3.621 0.000293 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 68.029  on 49  degrees of freedom
## Residual deviance: 34.196  on 48  degrees of freedom
## AIC: 38.196
## 
## Number of Fisher Scoring iterations: 6

\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol + \beta_2 Age\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}{1+exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}\]

# Extend the logistic model with age as a second predictor.
logistic_regression <- glm(
  heart_disease ~ cholesterol + age,
  data = heart_df,
  family = 'binomial'
)

summary(logistic_regression)
## 
## Call:
## glm(formula = heart_disease ~ cholesterol + age, family = "binomial", 
##     data = heart_df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -25.27279    7.08238  -3.568 0.000359 ***
## cholesterol   0.06902    0.02652   2.603 0.009249 ** 
## age           0.20247    0.07907   2.561 0.010449 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 68.029  on 49  degrees of freedom
## Residual deviance: 26.894  on 47  degrees of freedom
## AIC: 32.894
## 
## Number of Fisher Scoring iterations: 6

Get odd ratios from parameters estimates:

# Exponentiate the coefficients to obtain odds ratios.
odds_ratios <- exp(coef(logistic_regression))
odds_ratios
##  (Intercept)  cholesterol          age 
## 1.057226e-11 1.071458e+00 1.224426e+00

Predict for new set of data with Logistic Regression:

  • New Data 1: A person with Age = 55 and cholesterol level 150.
  • New Data 2: A person with Age = 75 and cholesterol level 190
# Predicted disease probabilities for the two new patients described above.
new_patients <- data.frame(cholesterol = c(150, 190), age = c(55, 75))
predict(logistic_regression, newdata = new_patients, type = "response")
##          1          2 
## 0.02223167 0.95375721
  • Probability of getting a heart disease for a person with age 55 years and cholesterol level of 150 is ≈ 2.2%.
  • Probability of getting a heart disease for a person with age 75 years and cholesterol level of 190 is ≈ 95.4%.

Bagging Methods

Regression Trees

Partition the data set into decision boundaries.

library(rpart)
library(rpart.plot)
# Shallow regression tree (max depth 2) of mpg on weight and cylinders.
tree_control <- list(cp = 0.00001, minbucket = 1, maxdepth = 2)
tree <- rpart(mpg ~ wt + cyl, data = mtcars, control = tree_control, method = 'anova')
rpart.plot(tree)

library(ggplot2)
# Overlay the tree's decision boundaries on the raw data:
# horizontal line = weight split, vertical line = cylinder split.
ggplot(mtcars, aes(x = cyl, y = wt, color = mpg)) +
  geom_point() +
  geom_hline(
    yintercept = 2.3, linetype = "dashed",
    color = "red", linewidth = 1
  ) +
  geom_vline(
    xintercept = 7, linetype = "dashed",
    color = "purple", linewidth = 1
  )

Bagging Method: Random Forest Regression

R code for Random Forest Regression

library(randomForest)
# Random forest regression with 500 trees; record variable importance.
Random_Forest_model <- randomForest(
  mpg ~ wt + cyl + drat,
  data = mtcars,
  ntree = 500,
  importance = TRUE
)
  • Get important features/variables from the model
# Extract variable-importance scores and draw them as a lollipop chart.
important_features <- as.data.frame(importance(Random_Forest_model))
important_features$feature_name <- row.names(important_features)

ggplot(important_features, aes(x = feature_name, y = `%IncMSE`)) +
  geom_segment(
    aes(x = feature_name, xend = feature_name, y = 0, yend = `%IncMSE`),
    color = "orange"
  ) +
  geom_point(aes(size = IncNodePurity), color = "darkorange", alpha = 0.9) +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "None") +
  xlab('Features') + ylab('Importance')

Bagging Method: Random Forest Classification

library(randomForest)

# Random forest classifier on the heart-disease data; keep importance scores.
Random_Forest_Classifier <- randomForest(
  as.factor(heart_disease) ~ cholesterol + age,
  data = heart_df,
  ntree = 500,
  importance = TRUE
)

important_features <- as.data.frame(importance(Random_Forest_Classifier))
important_features$feature_name <- row.names(important_features)

# Lollipop chart: Gini-based importance; point size = accuracy-based importance.
ggplot(important_features, aes(x = feature_name, y = MeanDecreaseGini)) +
  geom_segment(
    aes(x = feature_name, xend = feature_name, y = 0, yend = MeanDecreaseGini),
    color = "orange"
  ) +
  geom_point(aes(size = MeanDecreaseAccuracy), color = "darkorange", alpha = 0.9) +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "none") +
  xlab('Features') + ylab('Importance')

Predict New Data with Random Forest:

# Class probabilities for two new patients.
new_patients <- data.frame(cholesterol = c(150, 190), age = c(55, 75))
predict(Random_Forest_Classifier, newdata = new_patients, type = "prob")
##       0     1
## 1 0.726 0.274
## 2 0.388 0.612
## attr(,"class")
## [1] "matrix" "array"  "votes"
  • A person with age 55 years and cholesterol level of 150 is predicted to not get a heart disease.

  • A person with age 75 years and cholesterol level of 190 is predicted to get a heart disease.

  • Probability of getting a heart disease for a person with age 55 years and cholesterol level of 150 is \(\approx 27\%\).

  • Probability of getting a heart disease for a person with age 75 years and cholesterol level of 190 is \(\approx 61\%\).

Boosting Methods

Boosting Method

Boosting Method: Gradient Boosting Regression

library(gbm)
set.seed(123)
# Gradient-boosted regression: 2000 trees, at least 5 observations per node.
# (gbm prints a note that it assumes a gaussian loss for this outcome.)
Gradient_Boost_Regression <- gbm(
  mpg ~ wt + cyl + drat,
  data = mtcars,
  n.trees = 2000,
  n.minobsinnode = 5
)
## Distribution not specified, assuming gaussian ...
summary(Gradient_Boost_Regression)

##       var  rel.inf
## wt     wt 52.83806
## drat drat 32.11113
## cyl   cyl 15.05080

Boosting Method: Gradient Boost Classification

set.seed(123)
# Gradient-boosted classifier; gbm infers a Bernoulli loss from the 0/1 outcome.
Gradient_Boost_Classifier <- gbm(
  heart_disease ~ cholesterol + age,
  data = heart_df,
  n.trees = 2000
)
## Distribution not specified, assuming bernoulli ...
summary(Gradient_Boost_Classifier)

##                     var  rel.inf
## cholesterol cholesterol 50.28782
## age                 age 49.71218
test_data <- data.frame(cholesterol = c(150, 190), age = c(55, 75))
# Predicted disease probabilities using all fitted trees.
pred_gbm <- predict(
  object = Gradient_Boost_Classifier,
  newdata = test_data,
  n.trees = Gradient_Boost_Classifier$n.trees,
  type = "response"
)
pred_gbm
## [1] 0.8239065 0.9999572
  • Probability of getting a heart disease for a person with age 55 years and cholesterol level of 150 is \(\approx 82\%\).
  • Probability of getting a heart disease for a person with age 75 years and cholesterol level of 190 is \(\approx 100\%\).

Model Evaluation

Evaluation of Regression Models

Evaluation of Classification Models

Confusion matrix, Sensitivity and Specificity

library(caret)
# Compare the forest's (out-of-bag) class predictions with the observed labels.
random_forest_prediction <- predict(Random_Forest_Classifier)
observed_data <- as.factor(heart_df$heart_disease)
confusionMatrix(random_forest_prediction, observed_data, positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 17  5
##          1  4 24
##                                           
##                Accuracy : 0.82            
##                  95% CI : (0.6856, 0.9142)
##     No Information Rate : 0.58            
##     P-Value [Acc > NIR] : 0.0002834       
##                                           
##                   Kappa : 0.633           
##                                           
##  Mcnemar's Test P-Value : 1.0000000       
##                                           
##             Sensitivity : 0.8276          
##             Specificity : 0.8095          
##          Pos Pred Value : 0.8571          
##          Neg Pred Value : 0.7727          
##              Prevalence : 0.5800          
##          Detection Rate : 0.4800          
##    Detection Prevalence : 0.5600          
##       Balanced Accuracy : 0.8186          
##                                           
##        'Positive' Class : 1               
## 

ROC Curve

library(pROC)
# ROC curve for the logistic model. predict() without type= returns link-scale
# scores; that is fine here because ROC/AUC only depend on the ranking.
# NOTE(review): `smoothed` is not a formal argument of pROC::roc() (the
# smoothing argument is `smooth`); it is absorbed by `...` — confirm intent.
roc_logistic = roc(heart_df$heart_disease, as.numeric(predict(logistic_regression)),smoothed = TRUE,plot=TRUE, grid=TRUE,print.auc=TRUE)

# ROC built from hard class predictions yields only a single operating point;
# using predicted probabilities would trace a fuller curve.
roc_random_forest = roc(heart_df$heart_disease, as.numeric(random_forest_prediction),smoothed = TRUE,plot=TRUE, grid=TRUE,print.auc=TRUE)

#plot(roc_logistic, main = "ROC Curve")
#lines(roc_random_forest,col='blue')

Over-fitting & Cross-Validation

Train-test Split

Let’s use the mtcars data set to apply a train-test split: the training set receives 70% of the observations and the test set receives 30%.

set.seed(1234)
# Randomly flag ~70% of rows for training. The indicator vector is named
# `in_train` rather than `sample` so it does not shadow base::sample().
in_train <- sample(c(TRUE, FALSE), nrow(mtcars), replace = TRUE, prob = c(0.7, 0.3))
mtcars_train <- mtcars[in_train, ]
mtcars_test  <- mtcars[!in_train, ]

Train Data Set Size

dim(mtcars_train)
## [1] 26 11

Test Data Set Size

dim(mtcars_test)
## [1]  6 11

K-Fold Cross Validation

Apply k-fold cross validation for mtcars training data set.

Cross Validation with Linear Regression

library(caret)

# 4-fold cross-validation setup.
cross_validate_setup <- trainControl(method = "cv", number = 4)

# Linear regression on all predictors; folds are compared by RMSE.
linear_regression_cv <- train(
  mpg ~ .,
  data = mtcars_train,
  trControl = cross_validate_setup,
  method = "lm",
  metric = 'RMSE'
)

linear_regression_cv
## Linear Regression 
## 
## 26 samples
## 10 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 19, 19, 20, 20 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   3.765965  0.7585956  2.974284
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Cross Validation with Random Forest Regression

library(caret)

# 4-fold cross-validation setup (re-created for this model).
cross_validate_setup <- trainControl(method = "cv", number = 4)

# Random forest tuned by grid search over mtry = 1..10; RMSE picks the winner.
random_forest_regression_cv <- train(
  mpg ~ .,
  data = mtcars_train,
  trControl = cross_validate_setup,
  method = "rf",
  metric = 'RMSE',
  tuneGrid = expand.grid(.mtry = c(1:10)),
  na.action = na.pass
)

random_forest_regression_cv
## Random Forest 
## 
## 26 samples
## 10 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 18, 21, 20, 19 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    1    2.685053  0.8574958  2.111840
##    2    2.426804  0.8694192  1.875677
##    3    2.376424  0.8723288  1.751273
##    4    2.366303  0.8671525  1.705216
##    5    2.340886  0.8646725  1.654538
##    6    2.316080  0.8670054  1.647052
##    7    2.386482  0.8635178  1.702981
##    8    2.315735  0.8655307  1.658174
##    9    2.344355  0.8587729  1.674820
##   10    2.283882  0.8637430  1.647277
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 10.

Random Forest Regression with mtry = 10 is the best model compared to the Linear Regression as the RMSE is the lowest.

Unsupervised Learning

K-Means Clustering

Lets apply K-Means Clustering for mtcars data set

set.seed(123)

# Partition the cars into two clusters on the raw (unscaled) features.
k_means_clustering <- kmeans(mtcars, centers = 2)

print(k_means_clustering$cluster)
##           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
##                   1                   1                   1                   1 
##   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
##                   2                   1                   2                   1 
##            Merc 230            Merc 280           Merc 280C          Merc 450SE 
##                   1                   1                   1                   2 
##          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
##                   2                   2                   2                   2 
##   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
##                   2                   1                   1                   1 
##       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
##                   1                   2                   2                   2 
##    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
##                   2                   1                   1                   1 
##      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
##                   2                   1                   2                   1

Plot K-Means Clustering using factoextra library

library(factoextra)
# Visualize the two k-means clusters in the first two principal components.
fviz_cluster(k_means_clustering, data = mtcars)

The following two methods can be used to check best number of clusters.

fviz_nbclust(mtcars, kmeans, method = "silhouette")
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_nbclust(mtcars, kmeans, method = "wss")

Hierarchical Clustering

Lets apply Hierarchical-Means Clustering for mtcars data set

library(ggdendro)
set.seed(123)
# Euclidean distances between cars, then hierarchical (complete-linkage,
# the hclust default) clustering, shown as a dendrogram.
mtcars_distance <- dist(mtcars, method = 'euclidean')
h_clustering <- hclust(mtcars_distance)
ggdendrogram(h_clustering)

Hierarchical-Means Clustering based heat-maps are popular in analyzing gene expression data

Data Source: https://bioconductor.org/packages/devel/data/experiment/html/celldex.html

library(BiocManager)
library(GEOquery)
library(pheatmap)

# Download GEO series GSE33126 from NCBI GEO; requires network access and can
# be slow. getGEO() returns a list of ExpressionSet objects.
gse = getGEO("GSE33126")

# Dimensions of the expression matrix (probes x samples) of the first set.
dim((exprs(gse[[1]])))
## [1] 48803    18
# Samples-by-probes matrix (transpose of the GEO expression matrix).
gene_exp = t(exprs(gse[[1]]))
# Keep only probes with a large total signal so the heatmap stays tractable.
data_subset = as.matrix(gene_exp[,colSums(gene_exp)>500000])
# Name the distance object distinctly so it does not shadow stats::dist().
# (The original also built an unused dendrogram `dhc`; removed.)
probe_dist = dist(t(data_subset) , diag=TRUE)
hc   = hclust(probe_dist)
#ggdendrogram(hc)
# Heatmap of column-scaled expression values with clustered rows/columns.
pheatmap(scale(data_subset))

PCA Biplot

Principal components reduce the data dimensions and allows to visualize the correlation structure of the data effectively

library(FactoMineR)

# Principal component analysis of mtcars; suppress the default base plots and
# draw a biplot (observations + variable loadings) instead.
pca_mtcars <- PCA(mtcars, graph = FALSE)

fviz_pca_biplot(pca_mtcars, repel = TRUE)
fviz_pca_biplot(pca_mtcars, repel = TRUE)